
*-----------------------------------------------------
*  load data if there is no data in memory
*-----------------------------------------------------

count
local has_data = `r(N)'>0
display "Indicator whether there is data in memory: "`has_data'

if `has_data'==0 { // load data if there is none

	clear
	set more off
	version 12
	
	use "$PFWdata/Alaska/Sample Database/Final/txns_sample.dta"

}


***********************************************
* Start of filter
***********************************************


//identify and drop duplicate accounts
//deletes duplicate observations ONLY IF genericXXXXdescription is missing (i.e., == "")
duplicates drop userid year month day amount if genericXXXXdescription == "", force
//(32664 observations deleted)

///////////////////////////////////////////////
//delete observations with single duplicates//
//////////////////////////////////////////////

//flag observations without description
g flag_nodesc = genericXXXXdescription == ""

//tag observations with the same userid year month day amount
duplicates tag userid year month day amount, generate(mask_dup)

//replace no description with "zzz" (useful in the next line when we sort
//based on generic description => observations with no description will always
//be the last ones
replace genericXXXXdescription = "zzz" if genericXXXXdescription == ""
sort userid year month day amount genericXXXXdescription 

//tag observations that are duplicates in terms of userid year month day amount
//and that have a different accountid (remember that with this part of the procedure
//our goal is to delete only those observations that are duplicates in pairs)
by userid year month day amount: g foo = 1 if mask_dup >= 1 & accountid != accountid[1]
drop if mask_dup == 1 & foo == 1
//(831314 observations deleted)

//actually not only in pairs but also those that are not in pairs but have no
//description (the procedure that deletes the observations with mask_dup > 1
//also uses generic description as a control... of course it does not work
//if there is no description)
drop if mask_dup > 1 & foo == 1 & flag_nodesc == 1
//(28633 observations deleted)

////////////////////////////////////////////////
//delete observations with multiple duplicates//
///////////////////////////////////////////////
//number of accounts per unique transaction (in terms of userid year month day amount genericXXXXdescription)
bys userid year month day amount genericXXXXdescription accountid: g naccounts = _N
//identify the account number that recurs more often per unique transaction 
//(in terms of userid year month day amount genericXXXXdescription)
bys userid year month day amount genericXXXXdescription: egen max_naccounts = max(naccounts)
//generate a dummy foo2 if the observation is duplicate in terms of userid year month day amount
//(which is the same criterion as above) and the accountid is not the one that 
//recurs more often. Note that here we deal only with observations that have more
//than one replication.
//Here, we use again the duplicate criterion userid year month day amount because sometimes
//the observations are clear duplicates but the description is slightly different
by userid year month day amount: g foo2 = 1 if mask_dup > 1 & naccounts != max_naccounts
drop if mask_dup > 1 & foo2 == 1
//(14869 observations deleted)

//generate a new duplicate mask. It takes value 1 when observations are the same
//in terms of userid year month day amount genericXXXXdescription
duplicates tag userid year month day amount genericXXXXdescription, generate(mask_dup2)
//generate a dummy that takes value 1 if naccount < the number of duplicates (note that
//if, e.g., there are three observations that are the same mask_dup2 takes value 2) AND
//the account is different from the first one.
bys userid year month day amount genericXXXXdescription: g foo3 = 1 if naccount < mask_dup2+1 & accountid != accountid[1]
drop if foo3 == 1
//(312143 observations deleted)

//clean up all the temporary variables and changes
replace genericXXXXdescription = "" if genericXXXXdescription == "zzz"
drop foo foo2 foo3 mask_dup2 mask_dup naccounts max_naccounts

//////////////////////////////////////////////
//delete duplicates with the same accountid//
/////////////////////////////////////////////
//drop if same transaction in terms of userid year month day amount genericXXXXdescription accountid 
//and transaction amount < -100 and the transaction is neither classified as Enterteinment nor as Travel.
///The idea is that it is possible that someone buys, say, multiple coffees
//in a day.
duplicates drop userid year month day amount genericXXXXdescription accountid if amount < -100 & !(inferredcategory > 100  & inferredcategory < 106) ///
																							   & !(inferredcategory > 1500 & inferredcategory < 1505) , force
//(55712 observations deleted)

//drop if same transaction in terms of userid year month day amount genericXXXXdescription accountid,
//category == 3001 (payroll) and amount > 1000. The idea is that you might receive two times a small amount the same
//transfer that XXXX recognizes as payroll (there is an example of someone in Reno that has multiple small
//payroll transfers in one day (tips?). More fishy if the amount is big.
duplicates drop userid year month day amount genericXXXXdescription accountid if inferredcategoryid == 3001 & amount > 1000, force
//(1116 observations deleted)


***********************************************
* End of filter
***********************************************



*-----------------------------------------------------
*  save data if there was no data in memory
*-----------------------------------------------------

if `has_data'==0 {

	save "$PFWdata/Alaska/Sample Database/Final/stata_eb/txns_sample_wo_duplicates.dta", replace
}
